#!/usr/bin/perl
#
# ldlm_bl CPU watchdog.
#
# On every run this script:
#   1. sets early_lock_cancel=0 on the matching Lustre MDC namespaces via lctl;
#   2. averages the %CPU that `ps` reports for all ldlm_bl_* threads;
#   3. compares the average against a threshold and, using a one-line state
#      file ("<STATUS> <epoch> [<count>]"), sends SMS alerts on the
#      OK -> FAILED and FAILED -> OK transitions;
#   4. forces an immediate reboot through /proc/sysrq-trigger once the high
#      load has been seen on more than 3 consecutive runs.
#
# NOTE(review): the failure counter in the state file is not reset by the
# reboot itself, so if the load is still high on the first run after the
# reboot the node will be rebooted again -- confirm this is intended.
use strict;
use warnings;

my $cmd_dir     = "/root/bin/ldlm_monitor";
my $state_file  = "$cmd_dir/lastsend";
my $sms_client  = "$cmd_dir/SmsWarningClient.pl";
my @sms_numbers = ("13810758632", "15701510767");
my $threshold   = 5;    # average %CPU above which the node counts as failing

print "seting ELC on /scratchfs\n";
my $dis_elc = `/usr/sbin/lctl set_param ldlm.namespaces.scrafs*-mdc-*.early_lock_cancel=0`;
$dis_elc = '' unless defined $dis_elc;
print "Result is $dis_elc\n";

# Keep only the ldlm_bl_* entries; the "grep" exclusion mirrors the usual
# `grep -v grep` guard against matching the filter process itself.
my @ps_lines   = grep { /ldlm_bl_/ && !/grep/ } `ps -eo pid,cmd,pcpu`;
my $total_ldlm = scalar @ps_lines;

my $pcpu_sum = 0;
for my $line (@ps_lines) {
    print $line;
    # split ' ' ignores the leading padding ps puts before right-aligned PIDs.
    # %CPU is the last column we asked for, so take the last field rather
    # than a fixed index that shifts with padding or spaces in the command.
    my @fields = split ' ', $line;
    next unless @fields;
    my $pcpu = $fields[-1];
    $pcpu_sum += $pcpu if $pcpu =~ /^\d+(?:\.\d+)?$/;
}

# No ldlm_bl threads at all means nothing is busy; avoid dividing by zero.
my $avg = sprintf("%0.2f", $total_ldlm ? $pcpu_sum / $total_ldlm : 0);

my $date     = localtime();
my $hostname = `hostname -s`;
$hostname = '' unless defined $hostname;
chomp $hostname;    # keep the trailing newline out of log lines and SMS text
my $now = time();

my ($last_status, $last_count) = read_state($state_file);

print "$date:the average cpu usage of $total_ldlm ldlm_bl_XX is $avg\n";
if ($avg > $threshold) {
    print "$date:ldlm_bl high cpu usage($avg) detected on $hostname\n";
    if ($last_status eq "OK") {
        # First failing run: record it and alert once.
        write_state($state_file, "FAILED $now 1");
        print "sending bad message\n";
        send_sms("ldlm_bl avg($avg) cpu detected on $hostname");
    }
    else {
        # Already failing: bump the consecutive-failure counter.
        my $failed_c = $last_count + 1;
        write_state($state_file, "FAILED $now $failed_c");
        if ($failed_c > 3) {
            print "Failed more than 3 times on at $date:$now\n";
            send_sms("ldlm_bl problem detected on $hostname 3+ times, reboot");
            print "I am goting to reboot it in 5 secs\n";
            sleep(5);
            trigger_reboot();
        }
    }
}
elsif ($last_status ne "OK") {
    # Load is back under the threshold after a failure: announce recovery.
    print "sending good message\n";
    send_sms("$hostname back from high ldlm_bl");
    write_state($state_file, "OK $now");
}

# Read the state file and return (status, failure_count).
# A missing, unreadable or empty file is treated as status "OK" with a
# count of 0, so the very first high-load run is reported as a new failure.
sub read_state {
    my ($path) = @_;
    my @fields;
    if (open(my $fh, '<', $path)) {
        local $/;    # slurp: the file holds a single short record
        my $content = <$fh>;
        close($fh);
        @fields = split ' ', $content if defined $content;
    }
    my $status = defined $fields[0] ? $fields[0] : "OK";
    my $count  = (defined $fields[2] && $fields[2] =~ /^\d+$/) ? $fields[2] : 0;
    return ($status, $count);
}

# Overwrite the state file with a single record followed by a newline.
# Failures are reported on STDERR but do not abort the run.
sub write_state {
    my ($path, $record) = @_;
    my $fh;
    if (!open($fh, '>', $path)) {
        warn "cannot write $path: $!\n";
        return;
    }
    print {$fh} "$record\n";
    close($fh) or warn "cannot close $path: $!\n";
    return;
}

# Send one SMS text to every configured number through the SMS client.
# The client is started without a shell (list-form pipe open), so the
# message needs no quoting; its output is read and discarded.
sub send_sms {
    my ($message) = @_;
    for my $number (@sms_numbers) {
        my @cmd = ($sms_client, "sms.ihep.ac.cn", "123", $message, "sms", $number);
        my $pid = open(my $out, '-|', @cmd);
        if (!$pid) {
            warn "cannot run $sms_client: $!\n";
            next;
        }
        my @discarded = <$out>;
        close($out);
    }
    return;
}

# Request an immediate reboot by writing "b" to the sysrq trigger.
sub trigger_reboot {
    my $fh;
    if (!open($fh, '>', "/proc/sysrq-trigger")) {
        warn "cannot open /proc/sysrq-trigger: $!\n";
        return;
    }
    print {$fh} "b\n";
    close($fh);
    return;
}
